CSE3020 DATA VISUALIZATION J-COMPONENT

Topic: COVID-19 based data visualization and analysis

Team: Soumik Kabiraj [20BCE1504], Mayank Yadav [20BCE1674]

Slot: D2



Importing necessary libraries

library(readr) 
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.2.2
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ dplyr   1.0.10
## ✔ tibble  3.1.8      ✔ stringr 1.4.0 
## ✔ tidyr   1.2.0      ✔ forcats 0.5.2 
## ✔ purrr   1.0.1
## Warning: package 'ggplot2' was built under R version 4.2.2
## Warning: package 'purrr' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(plotly)
## Warning: package 'plotly' was built under R version 4.2.3
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 4.2.3
library(DT)
## Warning: package 'DT' was built under R version 4.2.3
library(scales)
## Warning: package 'scales' was built under R version 4.2.3
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
library(openair) 
## Warning: package 'openair' was built under R version 4.2.3
library(corrplot) 
## Warning: package 'corrplot' was built under R version 4.2.2
## corrplot 0.92 loaded
library(caTools) 
## Warning: package 'caTools' was built under R version 4.2.2
library(caret)
## Warning: package 'caret' was built under R version 4.2.2
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift

Reading datasets

confirmed <- read_csv("C:/Users/SOUMIK/Desktop/DV J-component/time_series_covid_19_confirmed.csv",col_types = cols(.default = col_double(), `Province/State` = col_character(), `Country/Region` = col_character()))

recovered <- read_csv(file = "C:/Users/SOUMIK/Desktop/DV J-component/time_series_covid_19_recovered.csv",col_types = cols(.default = col_double(),`Province/State` = col_character(), `Country/Region` = col_character()))

deaths <- read_csv(file = "C:/Users/SOUMIK/Desktop/DV J-component/time_series_covid_19_deaths.csv",col_types = cols(.default = col_double(),`Province/State` = col_character(), `Country/Region` = col_character()))

codes <- read_csv('C:/Users/SOUMIK/Desktop/DV J-component/2014_world_gdp_with_codes.csv',col_types = cols(COUNTRY = col_character(),`GDP (BILLIONS)` = col_double(),CODE = col_character()))

data <- read.csv("C:/Users/SOUMIK/Desktop/DV J-component/Conditions_Contributing_to_COVID-19_Deaths__by_State_and_Age__Provisional_2020-2022.csv")

df <- read.csv("C:/Users/SOUMIK/Desktop/DV J-component/covid_19_india.csv")

dim(confirmed)
## [1] 276 498
dim(recovered)
## [1] 261 498
dim(deaths)
## [1] 276 498
dim(data)
## [1] 385020     14
dim(df)
## [1] 18110     9
head(confirmed)
## # A tibble: 6 × 498
##   Provinc…¹ Count…²   Lat   Long 1/22/…³ 1/23/…⁴ 1/24/…⁵ 1/25/…⁶ 1/26/…⁷ 1/27/…⁸
##   <chr>     <chr>   <dbl>  <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
## 1 <NA>      Afghan…  33.9  67.7        0       0       0       0       0       0
## 2 <NA>      Albania  41.2  20.2        0       0       0       0       0       0
## 3 <NA>      Algeria  28.0   1.66       0       0       0       0       0       0
## 4 <NA>      Andorra  42.5   1.52       0       0       0       0       0       0
## 5 <NA>      Angola  -11.2  17.9        0       0       0       0       0       0
## 6 <NA>      Antigu…  17.1 -61.8        0       0       0       0       0       0
## # … with 488 more variables: `1/28/20` <dbl>, `1/29/20` <dbl>, `1/30/20` <dbl>,
## #   `1/31/20` <dbl>, `2/1/20` <dbl>, `2/2/20` <dbl>, `2/3/20` <dbl>,
## #   `2/4/20` <dbl>, `2/5/20` <dbl>, `2/6/20` <dbl>, `2/7/20` <dbl>,
## #   `2/8/20` <dbl>, `2/9/20` <dbl>, `2/10/20` <dbl>, `2/11/20` <dbl>,
## #   `2/12/20` <dbl>, `2/13/20` <dbl>, `2/14/20` <dbl>, `2/15/20` <dbl>,
## #   `2/16/20` <dbl>, `2/17/20` <dbl>, `2/18/20` <dbl>, `2/19/20` <dbl>,
## #   `2/20/20` <dbl>, `2/21/20` <dbl>, `2/22/20` <dbl>, `2/23/20` <dbl>, …
## # ℹ Use `colnames()` to see all variable names
head(recovered)
## # A tibble: 6 × 498
##   Provinc…¹ Count…²   Lat   Long 1/22/…³ 1/23/…⁴ 1/24/…⁵ 1/25/…⁶ 1/26/…⁷ 1/27/…⁸
##   <chr>     <chr>   <dbl>  <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
## 1 <NA>      Afghan…  33.9  67.7        0       0       0       0       0       0
## 2 <NA>      Albania  41.2  20.2        0       0       0       0       0       0
## 3 <NA>      Algeria  28.0   1.66       0       0       0       0       0       0
## 4 <NA>      Andorra  42.5   1.52       0       0       0       0       0       0
## 5 <NA>      Angola  -11.2  17.9        0       0       0       0       0       0
## 6 <NA>      Antigu…  17.1 -61.8        0       0       0       0       0       0
## # … with 488 more variables: `1/28/20` <dbl>, `1/29/20` <dbl>, `1/30/20` <dbl>,
## #   `1/31/20` <dbl>, `2/1/20` <dbl>, `2/2/20` <dbl>, `2/3/20` <dbl>,
## #   `2/4/20` <dbl>, `2/5/20` <dbl>, `2/6/20` <dbl>, `2/7/20` <dbl>,
## #   `2/8/20` <dbl>, `2/9/20` <dbl>, `2/10/20` <dbl>, `2/11/20` <dbl>,
## #   `2/12/20` <dbl>, `2/13/20` <dbl>, `2/14/20` <dbl>, `2/15/20` <dbl>,
## #   `2/16/20` <dbl>, `2/17/20` <dbl>, `2/18/20` <dbl>, `2/19/20` <dbl>,
## #   `2/20/20` <dbl>, `2/21/20` <dbl>, `2/22/20` <dbl>, `2/23/20` <dbl>, …
## # ℹ Use `colnames()` to see all variable names
head(deaths)
## # A tibble: 6 × 498
##   Provinc…¹ Count…²   Lat   Long 1/22/…³ 1/23/…⁴ 1/24/…⁵ 1/25/…⁶ 1/26/…⁷ 1/27/…⁸
##   <chr>     <chr>   <dbl>  <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
## 1 <NA>      Afghan…  33.9  67.7        0       0       0       0       0       0
## 2 <NA>      Albania  41.2  20.2        0       0       0       0       0       0
## 3 <NA>      Algeria  28.0   1.66       0       0       0       0       0       0
## 4 <NA>      Andorra  42.5   1.52       0       0       0       0       0       0
## 5 <NA>      Angola  -11.2  17.9        0       0       0       0       0       0
## 6 <NA>      Antigu…  17.1 -61.8        0       0       0       0       0       0
## # … with 488 more variables: `1/28/20` <dbl>, `1/29/20` <dbl>, `1/30/20` <dbl>,
## #   `1/31/20` <dbl>, `2/1/20` <dbl>, `2/2/20` <dbl>, `2/3/20` <dbl>,
## #   `2/4/20` <dbl>, `2/5/20` <dbl>, `2/6/20` <dbl>, `2/7/20` <dbl>,
## #   `2/8/20` <dbl>, `2/9/20` <dbl>, `2/10/20` <dbl>, `2/11/20` <dbl>,
## #   `2/12/20` <dbl>, `2/13/20` <dbl>, `2/14/20` <dbl>, `2/15/20` <dbl>,
## #   `2/16/20` <dbl>, `2/17/20` <dbl>, `2/18/20` <dbl>, `2/19/20` <dbl>,
## #   `2/20/20` <dbl>, `2/21/20` <dbl>, `2/22/20` <dbl>, `2/23/20` <dbl>, …
## # ℹ Use `colnames()` to see all variable names
head(data)
##   Data.As.Of Start.Date   End.Date    Group Year Month         State
## 1 03/27/2022 01/01/2020 03/26/2022 By Total   NA    NA United States
## 2 03/27/2022 01/01/2020 03/26/2022 By Total   NA    NA United States
## 3 03/27/2022 01/01/2020 03/26/2022 By Total   NA    NA United States
## 4 03/27/2022 01/01/2020 03/26/2022 By Total   NA    NA United States
## 5 03/27/2022 01/01/2020 03/26/2022 By Total   NA    NA United States
## 6 03/27/2022 01/01/2020 03/26/2022 By Total   NA    NA United States
##        Condition.Group               Condition ICD10_codes Age.Group
## 1 Respiratory diseases Influenza and pneumonia     J09-J18      0-24
## 2 Respiratory diseases Influenza and pneumonia     J09-J18     25-34
## 3 Respiratory diseases Influenza and pneumonia     J09-J18     35-44
## 4 Respiratory diseases Influenza and pneumonia     J09-J18     45-54
## 5 Respiratory diseases Influenza and pneumonia     J09-J18     55-64
## 6 Respiratory diseases Influenza and pneumonia     J09-J18     65-74
##   COVID.19.Deaths Number.of.Mentions Flag
## 1            1284               1341     
## 2            5390               5591     
## 3           14166              14726     
## 4           35320              36658     
## 5           77221              79932     
## 6          118051             121479
head(df)
##   Sno       Date    Time State.UnionTerritory ConfirmedIndianNational
## 1   1 2020-01-30 6:00 PM               Kerala                       1
## 2   2 2020-01-31 6:00 PM               Kerala                       1
## 3   3 2020-02-01 6:00 PM               Kerala                       2
## 4   4 2020-02-02 6:00 PM               Kerala                       3
## 5   5 2020-02-03 6:00 PM               Kerala                       3
## 6   6 2020-02-04 6:00 PM               Kerala                       3
##   ConfirmedForeignNational Cured Deaths Confirmed
## 1                        0     0      0         1
## 2                        0     0      0         1
## 3                        0     0      0         2
## 4                        0     0      0         3
## 5                        0     0      0         3
## 6                        0     0      0         3

DATA PREPROCESSING

colnames(confirmed)[2]="Country"
colnames(recovered)[2]="Country"
colnames(deaths)[2]="Country"
confirmed <- confirmed %>%
  gather("Date", "Confirmed", -c("Province/State", "Country", "Lat", "Long")) %>%
  mutate(Date = as.Date(Date, "%m/%d/%y"))

recovered <- recovered %>%
  gather("Date", "Recovered", -c("Province/State", "Country", "Lat", "Long")) %>%
  mutate(Date = as.Date(Date, "%m/%d/%y"))

deaths <- deaths %>%
  gather("Date", "Deaths", -c("Province/State", "Country", "Lat", "Long")) %>%
  mutate(Date = as.Date(Date, "%m/%d/%y"))

head(confirmed)
## # A tibble: 6 × 6
##   `Province/State` Country               Lat   Long Date       Confirmed
##   <chr>            <chr>               <dbl>  <dbl> <date>         <dbl>
## 1 <NA>             Afghanistan          33.9  67.7  2020-01-22         0
## 2 <NA>             Albania              41.2  20.2  2020-01-22         0
## 3 <NA>             Algeria              28.0   1.66 2020-01-22         0
## 4 <NA>             Andorra              42.5   1.52 2020-01-22         0
## 5 <NA>             Angola              -11.2  17.9  2020-01-22         0
## 6 <NA>             Antigua and Barbuda  17.1 -61.8  2020-01-22         0
head(recovered)
## # A tibble: 6 × 6
##   `Province/State` Country               Lat   Long Date       Recovered
##   <chr>            <chr>               <dbl>  <dbl> <date>         <dbl>
## 1 <NA>             Afghanistan          33.9  67.7  2020-01-22         0
## 2 <NA>             Albania              41.2  20.2  2020-01-22         0
## 3 <NA>             Algeria              28.0   1.66 2020-01-22         0
## 4 <NA>             Andorra              42.5   1.52 2020-01-22         0
## 5 <NA>             Angola              -11.2  17.9  2020-01-22         0
## 6 <NA>             Antigua and Barbuda  17.1 -61.8  2020-01-22         0
head(deaths)
## # A tibble: 6 × 6
##   `Province/State` Country               Lat   Long Date       Deaths
##   <chr>            <chr>               <dbl>  <dbl> <date>      <dbl>
## 1 <NA>             Afghanistan          33.9  67.7  2020-01-22      0
## 2 <NA>             Albania              41.2  20.2  2020-01-22      0
## 3 <NA>             Algeria              28.0   1.66 2020-01-22      0
## 4 <NA>             Andorra              42.5   1.52 2020-01-22      0
## 5 <NA>             Angola              -11.2  17.9  2020-01-22      0
## 6 <NA>             Antigua and Barbuda  17.1 -61.8  2020-01-22      0
#Merging confirmed, recovered and deaths dataset into one dataset
ts_total <- confirmed %>%
  left_join(deaths) %>%
  left_join(recovered) %>%
  mutate(Recovered = replace_na(Recovered, replace = 0))
## Joining, by = c("Province/State", "Country", "Lat", "Long", "Date")
## Joining, by = c("Province/State", "Country", "Lat", "Long", "Date")
head(ts_total)
## # A tibble: 6 × 8
##   `Province/State` Country          Lat   Long Date       Confi…¹ Deaths Recov…²
##   <chr>            <chr>          <dbl>  <dbl> <date>       <dbl>  <dbl>   <dbl>
## 1 <NA>             Afghanistan     33.9  67.7  2020-01-22       0      0       0
## 2 <NA>             Albania         41.2  20.2  2020-01-22       0      0       0
## 3 <NA>             Algeria         28.0   1.66 2020-01-22       0      0       0
## 4 <NA>             Andorra         42.5   1.52 2020-01-22       0      0       0
## 5 <NA>             Angola         -11.2  17.9  2020-01-22       0      0       0
## 6 <NA>             Antigua and B…  17.1 -61.8  2020-01-22       0      0       0
## # … with abbreviated variable names ¹​Confirmed, ²​Recovered
dim(ts_total)
## [1] 136344      8
#We all know "Diamond Princess" and "MS Zaandam" are cruises, So we have to remove them from the data
ts_total <- ts_total %>%
  filter(Country != "Diamond Princess") %>%
  filter(Country != "MS Zaandam")
ts_total$Deaths[is.na(ts_total$Deaths)] <- 0
#Grouping by country, date and incorporating new cases
cases_latest <- ts_total %>%
  group_by(Country, Date) %>%
  summarise(Confirmed  = sum(Confirmed),Recovered = sum(Recovered),Deaths = sum(Deaths)) %>%
  mutate("New Cases" = Confirmed - lag(Confirmed, 1) ) %>%
  filter(Date == max(Date))
## `summarise()` has grouped output by 'Country'. You can override using the
## `.groups` argument.
day_latest <- max(cases_latest$Date)

head(cases_latest)
## # A tibble: 6 × 6
## # Groups:   Country [6]
##   Country             Date       Confirmed Recovered Deaths `New Cases`
##   <chr>               <date>         <dbl>     <dbl>  <dbl>       <dbl>
## 1 Afghanistan         2021-05-29     70111     57281   2899         981
## 2 Albania             2021-05-29    132297    129215   2449          12
## 3 Algeria             2021-05-29    128456     89419   3460         258
## 4 Andorra             2021-05-29     13693     13416    127           0
## 5 Angola              2021-05-29     34180     27646    757         236
## 6 Antigua and Barbuda 2021-05-29      1259      1206     42           0
#Created a dataset including total news of COVID-19
cases_total_date <- ts_total %>%
  rename(Region = Country) %>%
  group_by(Date) %>%
  summarise(Confirmed = sum(Confirmed),
            Deaths = sum(Deaths),
            Recovered = sum(Recovered)) %>%
  mutate("New_Cases" = Confirmed - lag(Confirmed, 1))

cases_total_date$New_Cases[is.na(cases_total_date$New_Cases)] <- 0 

cases_total_latest <- cases_total_date %>%
  filter(Date == max(Date))

head(cases_total_latest)
## # A tibble: 1 × 5
##   Date       Confirmed  Deaths Recovered New_Cases
##   <date>         <dbl>   <dbl>     <dbl>     <dbl>
## 1 2021-05-29 169950839 3533604 105719754    480835
head(cases_total_date)
## # A tibble: 6 × 5
##   Date       Confirmed Deaths Recovered New_Cases
##   <date>         <dbl>  <dbl>     <dbl>     <dbl>
## 1 2020-01-22       557     17        30         0
## 2 2020-01-23       655     18        32        98
## 3 2020-01-24       941     26        39       286
## 4 2020-01-25      1433     42        42       492
## 5 2020-01-26      2118     56        56       685
## 6 2020-01-27      2927     82        65       809
codes <- codes %>%
  select(COUNTRY, CODE) %>%
  rename(Region = COUNTRY ,Code = CODE) %>%
  rownames_to_column("id")
head(codes)
## # A tibble: 6 × 3
##   id    Region         Code 
##   <chr> <chr>          <chr>
## 1 1     Afghanistan    AFG  
## 2 2     Albania        ALB  
## 3 3     Algeria        DZA  
## 4 4     American Samoa ASM  
## 5 5     Andorra        AND  
## 6 6     Angola         AGO
codes$id <- as.integer(codes$id)
head(codes)
## # A tibble: 6 × 3
##      id Region         Code 
##   <int> <chr>          <chr>
## 1     1 Afghanistan    AFG  
## 2     2 Albania        ALB  
## 3     3 Algeria        DZA  
## 4     4 American Samoa ASM  
## 5     5 Andorra        AND  
## 6     6 Angola         AGO
codes$Region <- codes$Region %>%
  str_replace(pattern = "United States", replacement = "US")
#adding country codes with cases_latest dataset
cases_latest_codes <- cases_latest %>%
  left_join(codes, by = c("Country" = "Region" )) %>%
  arrange(desc(Confirmed))
head(cases_latest_codes)
## # A tibble: 6 × 8
## # Groups:   Country [6]
##   Country Date       Confirmed Recovered Deaths `New Cases`    id Code 
##   <chr>   <date>         <dbl>     <dbl>  <dbl>       <dbl> <int> <chr>
## 1 US      2021-05-29  33251939         0 594306       11976   212 USA  
## 2 India   2021-05-29  27894800  25454320 325972      165553    93 IND  
## 3 Brazil  2021-05-29  16471600  14496224 461057       79670    28 BRA  
## 4 France  2021-05-29   5719877    390878 109518       11527    71 FRA  
## 5 Turkey  2021-05-29   5235978   5094279  47271        7656   205 TUR  
## 6 Russia  2021-05-29   4995613   4616422 118781        9155   165 RUS
cases_latest_codes %>%
  select(Country,Code, Date, Confirmed, `New Cases`, Recovered, Deaths) %>%
  arrange(desc(Confirmed)) %>%
  datatable(rownames = FALSE,options = list())
cases_all <- cases_total_date %>%
  select(-Confirmed, -New_Cases) %>%
  gather("Status", "Cases", -"Date")

barchart <- ggplot(data = cases_total_date, aes(x = Date)) +
  geom_bar(aes(y = Confirmed), position = "stack", stat = "identity", fill = "#ff5050") +
  geom_bar(data = cases_all, aes(y = Cases, fill = Status), position = "stack", stat = "identity") +
  scale_fill_manual(values = c("#000000", "#009900")) +
  scale_y_continuous(breaks = seq(0, 21000000, by = 1000000), labels = comma) +
  theme(panel.background = element_rect(fill = "White"),
        legend.position = "bottom",
        axis.title = element_blank(),
        axis.text.y = element_blank(),
        axis.ticks = element_blank()) +
  ggtitle("World COVID-19 Total Cases by Day")

ggplotly(barchart) 
cases_total_date <- cases_total_date %>%
  group_by(Date, Confirmed) %>%
  mutate(Mortality_rate = Deaths / Confirmed, Recovery_rate = Recovered / Confirmed) %>%ungroup()

barchart_1 <-cases_total_date %>%
  select(Date, Mortality_rate, Recovery_rate) %>%
  gather(status.ratio, ratio, -Date ) %>%
  ggplot(aes(x = Date, y = ratio, fill = status.ratio)) +
  geom_bar(stat = "identity", position = "dodge") +
    theme(panel.background = element_rect(fill = "White"),
          legend.position = "bottom",
          axis.title = element_blank(),
          axis.text.y = element_blank(),
          axis.ticks = element_blank()) +
    ggtitle("The Mortality_rate and the Recovery_rate")

ggplotly(barchart_1) %>%
layout(legend = list(orientation = 'h'))
conf<- cases_total_date[,"Confirmed"]
date <- seq(from=as.Date('2020-01-22'),by=1,to = as.Date('2021-05-29'))
calendarPlot(data.frame(conf, date), pollutant = 'Confirmed', year = 2020, main = "Confirmed Cases")

rec <- cases_total_date[,"Recovered"]
calendarPlot(data.frame(rec, date), pollutant = 'Recovered', year = 2020, main = "Recovered Cases", cols = "PiYG")

det <- cases_total_date[,"Deaths"]
calendarPlot(data.frame(det, date), pollutant = 'Deaths', year = 2020, main = "Deaths", cols = "RdGy")

New_Cases <- cases_total_date[,"New_Cases"]
calendarPlot(data.frame(New_Cases, date), pollutant = 'New_Cases', year = 2020, main = "New Cases", cols = "BrBG")

cases_total_date %>%
  select(-Date) %>%
  na.omit() %>%
  cor(use = "pairwise.complete.obs") %>%
  corrplot.mixed()

top_10_confirmed <- cases_latest %>%
  select(Country, Confirmed) %>%
  arrange(desc(Confirmed))

top_10_confirmed[1:10,] %>%
  ggplot(aes(x = reorder(Country,Confirmed), y = Confirmed )) +
  geom_bar(stat = "identity", fill  = "red", width = 0.8) +
  scale_y_continuous(labels = scales::comma) +
  labs(x = "", y = "", title = "Top 10 (the Most Confirmed Cases)") +
  theme(axis.text.x = element_text(angle = 45)) +
  theme(axis.title = element_text(size = 14, colour = "black"),
        axis.text.y = element_text(size = 11, face = "bold"))

top_10_Deaths <- cases_latest %>%
  select(Country, Deaths) %>%
  arrange(desc(Deaths))

top_10_Deaths[1:10,] %>%
  ggplot(aes(x = reorder(Country,Deaths), y = Deaths )) +
  geom_bar(stat = "identity", fill  = "blue", width = 0.8) +
  scale_y_continuous(labels = scales::comma) +
  labs(x = "", y = "", title = "Top 10 (the Most Deaths)") +
  theme(axis.text.x = element_text(angle = 45)) +
  theme(axis.title = element_text(size = 14, colour = "black"),
        axis.text.y = element_text(size = 11, face = "bold"))

top_10_Recovered <- cases_latest %>%
    select(Country, Recovered) %>%
  arrange(desc(Recovered))

top_10_Recovered[1:10,] %>%
  ggplot(aes(x = reorder(Country,Recovered), y = Recovered )) +
  geom_bar(stat = "identity", fill  = "green", width = 0.8) +
  scale_y_continuous(labels = scales::comma) +
  labs(x = "", y = "", title = "Top 10 (the Most Recovered)") +
  theme(axis.text.x = element_text(angle = 45)) +
  theme(axis.title = element_text(size = 14, colour = "black"),
        axis.text.y = element_text(size = 11, face = "bold"))

head(cases_latest)
## # A tibble: 6 × 6
## # Groups:   Country [6]
##   Country             Date       Confirmed Recovered Deaths `New Cases`
##   <chr>               <date>         <dbl>     <dbl>  <dbl>       <dbl>
## 1 Afghanistan         2021-05-29     70111     57281   2899         981
## 2 Albania             2021-05-29    132297    129215   2449          12
## 3 Algeria             2021-05-29    128456     89419   3460         258
## 4 Andorra             2021-05-29     13693     13416    127           0
## 5 Angola              2021-05-29     34180     27646    757         236
## 6 Antigua and Barbuda 2021-05-29      1259      1206     42           0
options(repr.plot.width = 40, repr.plot.height = 9)

cases_latest %>%
    select(Country, Confirmed, Recovered, Deaths) %>%
    gather(key = group_var, value = "Cases", -Country, na.rm = TRUE) %>%
    group_by(Country, group_var) %>%
    summarise(n = sum(Cases), .groups = "drop_last") %>%
    arrange(desc(n)) %>%
    group_by(group_var) %>%
    slice(1:5)%>%
  
    ggplot(aes(x = Country, y = n, fill=Country)) +
    geom_bar(stat = "identity") +
    facet_grid(~ group_var, scales = "free") +
    scale_y_continuous(labels = scales::comma) +
    geom_label(aes(label=round(n/1000000, 1)), size=2, fill="white") +
    labs(title = "Top Countries per Case Type", subtitle = "Numbers in Millions") + theme(
    axis.text.x = element_text(angle = 30, vjust = 1, hjust = 1))

top_6_affected <- ts_total %>%
  select(Country, Date, Confirmed, Deaths, Recovered) %>%
  filter(Country %in% c("US", "Spain", "Italy", "United Kingdom",
                                 "Russia","France")) %>%
  group_by(Country, Date) %>%
  summarise(Confirmed  = sum(Confirmed),
            Recovered = sum(Recovered),
            Deaths = sum(Deaths)) %>%
  mutate("New_Cases" = Confirmed - lag(Confirmed, 1),
         "Recovery_Rate" = Recovered / Confirmed ,
         "Mortality_Rate" = Deaths / Confirmed) %>%
  mutate("New_Cases" = round(New_Cases, 3),
         "Recovery_Rate" = round(Recovery_Rate, 3),
         "Mortality_Rate" = round(Mortality_Rate, 3))
## `summarise()` has grouped output by 'Country'. You can override using the
## `.groups` argument.
top_6_affected$New_Cases[is.na(top_6_affected$New_Cases)] <- 0
top_6_affected$Recovery_Rate[is.nan(top_6_affected$Recovery_Rate)] <- 0
top_6_affected$Mortality_Rate[is.nan(top_6_affected$Mortality_Rate)] <- 0
top_6_affected %>%
  ggplot(aes(x = Date, y = Mortality_Rate, fill = Country)) +
  geom_bar(stat = "identity",alpha = 0.8) +
  facet_wrap(~ Country) +
  labs(x = "", y = "Mortality Rate") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "none",
        axis.title.y = element_text(face = "bold", size = 10))

COMORBIDITIES

data %>% 
  group_by(Condition.Group) %>% 
  summarise(covid_deaths = sum(COVID.19.Deaths, na.rm = TRUE)) %>% 
  mutate(relative_frequency = covid_deaths/sum(covid_deaths)) %>% 
  arrange(desc(covid_deaths))
## # A tibble: 12 × 3
##    Condition.Group                                               covid…¹ relat…²
##    <chr>                                                           <int>   <dbl>
##  1 Respiratory diseases                                           1.33e7 0.300  
##  2 COVID-19                                                       1.17e7 0.265  
##  3 Circulatory diseases                                           7.82e6 0.176  
##  4 All other conditions and causes (residual)                     4.72e6 0.107  
##  5 Diabetes                                                       1.74e6 0.0393 
##  6 Renal failure                                                  1.26e6 0.0283 
##  7 Sepsis                                                         1.15e6 0.0259 
##  8 Vascular and unspecified dementia                              8.90e5 0.0201 
##  9 Malignant neoplasms                                            5.90e5 0.0133 
## 10 Obesity                                                        5.64e5 0.0127 
## 11 Alzheimer disease                                              3.25e5 0.00732
## 12 Intentional and unintentional injury, poisoning, and other a…  2.56e5 0.00578
## # … with abbreviated variable names ¹​covid_deaths, ²​relative_frequency
data %>%
  group_by(Age.Group) %>% 
  summarise(covid_deaths = sum(COVID.19.Deaths, na.rm = TRUE)) %>% 
  mutate(relative_frequency = covid_deaths/sum(covid_deaths)) %>%
  filter(Age.Group != 'All Ages' & Age.Group != 'Not stated' )%>%
  arrange(desc(covid_deaths))
## # A tibble: 8 × 3
##   Age.Group covid_deaths relative_frequency
##   <chr>            <int>              <dbl>
## 1 75-84          5801833            0.131  
## 2 85+            5431930            0.123  
## 3 65-74          5244416            0.118  
## 4 55-64          3286716            0.0742 
## 5 45-54          1434452            0.0324 
## 6 35-44           556273            0.0126 
## 7 25-34           202749            0.00457
## 8 0-24             50935            0.00115
comorb_data <- data %>% 
  filter(Age.Group != 'All Ages' & Age.Group != 'Not stated' ) %>%
  filter(Condition.Group != 'COVID-19') %>%
  drop_na(COVID.19.Deaths) %>% 
  select(-c(Flag, Number.of.Mentions, Condition.Group ,ICD10_codes, Data.As.Of, Year, Month, State, Group)) %>% 
  rename(Age_Group = Age.Group, Covid_Deaths = COVID.19.Deaths, Start_Date = Start.Date, End_Date = End.Date) %>%  
  arrange(desc(Covid_Deaths))

final_comorb <- comorb_data %>% 
  group_by(Age_Group, Condition) %>% 
  summarise(Covid_Deaths = sum(Covid_Deaths, na.rm = T)) %>% 
  mutate(Relative_Frequency = Covid_Deaths/sum(Covid_Deaths)) %>% 
  arrange(desc(Covid_Deaths))
## `summarise()` has grouped output by 'Age_Group'. You can override using the
## `.groups` argument.
head(final_comorb)
## # A tibble: 6 × 4
## # Groups:   Age_Group [3]
##   Age_Group Condition                                  Covid_Deaths Relative_F…¹
##   <chr>     <chr>                                             <int>        <dbl>
## 1 75-84     Influenza and pneumonia                          737851        0.172
## 2 65-74     Influenza and pneumonia                          708618        0.182
## 3 85+       Influenza and pneumonia                          616583        0.157
## 4 75-84     Respiratory failure                              614542        0.143
## 5 75-84     All other conditions and causes (residual)       613078        0.143
## 6 65-74     Respiratory failure                              578116        0.148
## # … with abbreviated variable name ¹​Relative_Frequency
options(repr.plot.width=15, repr.plot.height = 10)
final_comorb %>% 
  ggplot(mapping = aes(x = Age_Group, y = Condition))+
  geom_tile(mapping = aes(fill = Covid_Deaths))

Visualizations and analysis of India data

datatable <- df %>%filter(Date == max(Date))
  
datatable %>%
  select(State.UnionTerritory, Date, Confirmed, Cured, Deaths) %>%
  arrange(desc(Confirmed)) %>%
  datatable(rownames = FALSE,options = list())
Top_6 <- 
  df %>%
  dplyr::filter(Date == max(Date)) %>%
  arrange(desc(Confirmed)) 
Top_6 <- Top_6[1:6,]

percent <- paste(Top_6$State.UnionTerritory ,round(100*Top_6$Confirmed / sum(Top_6$Confirmed), 2), "%")

Top_6 %>%
  ggplot(aes(x = "", y = Confirmed, fill = percent) ) +
  geom_bar(stat = "identity",  width = 1) +
  coord_polar(theta = "y") +
  scale_fill_brewer(palette = "Pastel1") +
  theme(panel.grid = element_blank(), panel.background = element_blank(),
       plot.title = element_text(hjust = 0.5),
       axis.text.x = element_blank()) +
  theme_economist() +
  labs(x="", y = "", title = "Confirmed Cases Ratio", fill = "") 

Top_6_all <- df %>%
  filter(State.UnionTerritory == Top_6$State.UnionTerritory)
## Warning in State.UnionTerritory == Top_6$State.UnionTerritory: longer object
## length is not a multiple of shorter object length
Top_6_all %>%
  select(State.UnionTerritory, Date, Confirmed, Cured, Deaths) %>%
  gather(status, cases, -c("State.UnionTerritory", "Date")) %>%
  ggplot()+
  geom_line(aes(Date, cases, group = status, color = status), lwd = 1) +
  labs(title="Confirmed/Recovered cases over time by top 6 State/Union Territory",color = "") +
  facet_wrap(~ State.UnionTerritory, scales = "free")+ theme_fivethirtyeight() 
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.

LINEAR REGRESSION

model <- lm(Deaths ~ Confirmed + Recovered + `New Cases`, data=cases_latest)
test <- summary(model)
R_sq <- test$r.squared
sprintf("The R-squared value for our model is: %f",R_sq)
## [1] "The R-squared value for our model is: 0.891188"
adj_R_sq <- test$adj.r.squared
sprintf("The adjusted R-squared value for our model is: %f",adj_R_sq)
## [1] "The adjusted R-squared value for our model is: 0.889443"

INFERENCE: The obtained adjusted R_squared value for our model is 0.889553. Therefore, our model is able to explain 88.95% of the variance.

K-FOLD CROSS-VALIDATION

fitControl <- trainControl(method = "repeatedcv",   
                           number = 10,     # number of folds
                           repeats = 10)    # repeated ten times
model.cv <- train(Deaths ~ Confirmed + Recovered + `New Cases`, data = cases_latest, method = "lasso", trControl = fitControl, preProcess = c('scale', 'center'))
model.cv
## The lasso 
## 
## 191 samples
##   3 predictor
## 
## Pre-processing: scaled (3), centered (3) 
## Resampling: Cross-Validated (10 fold, repeated 10 times) 
## Summary of sample sizes: 171, 171, 172, 173, 171, 172, ... 
## Resampling results across tuning parameters:
## 
##   fraction  RMSE      Rsquared   MAE      
##   0.1       42034.81  0.9093862  22382.456
##   0.5       17624.62  0.9036615   8481.608
##   0.9       29150.02  0.8699733  10161.335
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was fraction = 0.5.

INFERENCE: We have used number of folds as 10 and number of repetitions as 10. The data has been preprocessed (scaling and centre) before feeding into the model. The final R-squared value obtained was 0.9097 ie 90.97% with the optimum fractions being 0.5.